{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "name": "Research.ipynb",
   "provenance": [],
   "collapsed_sections": [],
   "toc_visible": true
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "FfUqQK_QKQNN"
   },
   "source": [
    "# 爬取 ProgrammableWeb"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "T-EX5RTJ8zI0"
   },
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import threading\n",
    "\n",
    "mashup_list_base = 'https://www.programmableweb.com/category/all/mashups?deadpool=1&order=created&sort=desc&page='\n",
    "base_url = 'https://www.programmableweb.com'\n",
    "page = 0  # 当前页数\n",
    "max_page = 319  # 指定最大页数"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "x7ayvIMxrhfe",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 357
    },
    "outputId": "0f8406d6-ba5f-4981-8713-be22926f93c0"
   },
   "source": [
    "# 从文件读取hrefs，就不用再爬一遍了\n",
    "href_file_name = 'hrefs-0905-2.txt'\n",
    "def read_hrefs(filename):\n",
    "    hrefs_date = {}\n",
    "    with open(filename, mode='r', encoding='utf-8') as href_file:\n",
    "        for line in href_file:\n",
    "            split = line.split('#####')\n",
    "            hrefs_date[split[0]] = split[1]\n",
    "            # hrefs.append(line)\n",
    "    return hrefs_date\n",
    "actived_hrefs = read_hrefs(href_file_name)\n",
    "hrefs = list(hrefs_date.keys())\n",
    "hrefs[:20]"
   ],
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['https://www.programmableweb.com/mashup/petdoption',\n",
       " 'https://www.programmableweb.com/mashup/landedcostio-consolidated-screening-list',\n",
       " 'https://www.programmableweb.com/mashup/voice-apps',\n",
       " 'https://www.programmableweb.com/mashup/best-gaming-pc-deals',\n",
       " 'https://www.programmableweb.com/mashup/keyword-research-tool',\n",
       " 'https://www.programmableweb.com/mashup/standuply',\n",
       " 'https://www.programmableweb.com/mashup/loanprofy',\n",
       " 'https://www.programmableweb.com/mashup/datantify',\n",
       " 'https://www.programmableweb.com/mashup/penny-parrot',\n",
       " 'https://www.programmableweb.com/mashup/banksnearme',\n",
       " 'https://www.programmableweb.com/mashup/hashtag-suggestions',\n",
       " 'https://www.programmableweb.com/mashup/digireality',\n",
       " 'https://www.programmableweb.com/mashup/autochat-whatsapp-business-api-sandbox',\n",
       " 'https://www.programmableweb.com/mashup/refinansierenet',\n",
       " 'https://www.programmableweb.com/mashup/mars-explorer',\n",
       " 'https://www.programmableweb.com/mashup/bitcoin-digital-downloads-and-terminal-2018',\n",
       " 'https://www.programmableweb.com/mashup/nlsql',\n",
       " 'https://www.programmableweb.com/mashup/field-engineer',\n",
       " 'https://www.programmableweb.com/mashup/vg-sammenlign',\n",
       " 'https://www.programmableweb.com/mashup/proprover']"
      ]
     },
     "metadata": {
      "tags": []
     },
     "execution_count": 8
    }
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "Y1LbTf5D9BZV"
   },
   "source": [
    "hrefs = []  # 详情页列表\n",
    "\n",
    "hrefs_date = {}\n",
    "\n",
    "# 多线程获取详情页的链接\n",
    "def get_detail_links(thread_count, page):\n",
    "    thread_count[0] += 1\n",
    "\n",
    "    mashup_list_page = mashup_list_base + str(page)  # mashup列表页面的URL\n",
    "    req = requests.get(mashup_list_page)\n",
    "    html_text = req.text\n",
    "\n",
    "    bs1 = BeautifulSoup(html_text)\n",
    "    div = bs1.find('div', class_='view-content')\n",
    "    table = div.find('table')\n",
    "    tbody = table.find('tbody')\n",
    "    trs = tbody.find_all('tr')  # 所有tr标签，每一个元素就是一个列表行，后续进一步获取\n",
    "    for tr in trs:  # 遍历每行，获取详情页的链接\n",
    "        td_title = tr.find('td', class_='views-field-title')\n",
    "        td_date = tr.find('td', class_='views-field-created')\n",
    "        a = td_title.find('a')\n",
    "        href1 = a.get('href')\n",
    "        href = base_url + href1  # detail_page\n",
    "        date = td_date.text.strip()\n",
    "\n",
    "        hrefs_date[href] = date\n",
    "\n",
    "        # hrefs.append(href + '#####' + date)  # 加入这个详情页链接到hrefs列表中，方便后续的一次性访问\n",
    "\n",
    "    thread_count[0] -= 1"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "LKJyvaEq9HH3"
   },
   "source": [
    "max_thread = 40\n",
    "thread_count = [0]\n",
    "\n",
    "page_index = 0\n",
    "while True:\n",
    "    if page_index >= max_page:\n",
    "        break\n",
    "    else:\n",
    "        if thread_count[0] < max_thread:\n",
    "            # 创建新新线程\n",
    "            t = threading.Thread(target=get_detail_links, args=(thread_count, page_index,))\n",
    "            t.start()\n",
    "            print('正在获取页码', page_index)\n",
    "            page_index += 1\n"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "RODcZMuLFnjM"
   },
   "source": [
    "len(hrefs)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "wRLPUOjf1Fg2"
   },
   "source": [
    "# 保存所有链接到文件\n",
    "with open('hrefs-0910-1.txt', mode='w', encoding='utf-8') as href_file:\n",
    "    for href in hrefs_date.keys():\n",
    "        href_file.write(href + '#####' + hrefs_date[href] + '\\n')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "WE7wLLTIrnfE",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "outputId": "380acabe-ade8-4add-d384-eceb98473ba3"
   },
   "source": [
    "# 现在hrefs_date 是包括了Deadpool的mashup\n",
    "# 和旧的mashup 进行对比然后挑出来没有爬取过的mashup\n",
    "\n",
    "actived_list = list(actived_hrefs.keys())\n",
    "all_list = list(hrefs_date.keys())\n",
    "\n",
    "# deactivated\n",
    "result4 = list(set(all_list).difference(set(actived_list)))\n",
    "len(result4)"
   ],
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "1530"
      ]
     },
     "metadata": {
      "tags": []
     },
     "execution_count": 9
    }
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "_Yakl9k3q_IP"
   },
   "source": [
    "# %% 分割列表的辅助函数\n",
    "def list_split(list_info, per_list_len):\n",
    "    \"\"\"\n",
    "    :param list_info:   列表\n",
    "    :param per_list_len:  每个小列表的长度\n",
    "    :return:\n",
    "    \"\"\"\n",
    "    list_of_group = zip(*(iter(list_info),) * per_list_len)\n",
    "    end_list = [list(i) for i in list_of_group]\n",
    "    count = len(list_info) % per_list_len\n",
    "    end_list.append(list_info[-count:]) if count != 0 else end_list\n",
    "    return end_list"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "XxaPPer8Kdkr",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "outputId": "0dd214cb-a378-4dd7-da19-46d4eeb8bf95"
   },
   "source": [
    "# 9.10\n",
    "hrefs = all_list"
   ],
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "text": [
      "1530\n"
     ],
     "name": "stdout"
    }
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "6QudA114w1eK"
   },
   "source": [
    "# 访问hrefs中的每一个href，最好多线程访问，不然太慢了\n",
    "max_thread = 40\n",
    "thread_count = [0]\n",
    "hrefs_split = list_split(hrefs, 8)\n",
    "mashups = []\n",
    "failed_hrefs = []"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "TGsIwnAQrxr8",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "outputId": "eb52299e-a6ba-40ea-ff2c-630f6ee2a3a6"
   },
   "source": [
    "len(hrefs_split)"
   ],
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "192"
      ]
     },
     "metadata": {
      "tags": []
     },
     "execution_count": 17
    }
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "1gHizCtjq_Pa"
   },
   "source": [
    "import json\n",
    "from data.crawler.data_objects import Api, ApiVersion, Mashup, DeadpoolMashup\n",
    "\n"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "TIz-8yGsq_Sv",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# 2020.09.04 New requirements\n",
    "- 爬取与Mashup的相关的API的一些信息，不仅仅是API的名字\n",
    "- 创建一个字典保存API的URL->API对象的映射。\n",
    "- 下次发现使用了相同的API的时候就不需要额外爬取一遍了\n",
    "- 与API爬虫有重复爬取的部分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "apis = {}\n",
    "failed_api_hrefs = []\n",
    "def get_api(url):\n",
    "    if url in apis:\n",
    "        print('请求的API ' + url + ' 直接返回')\n",
    "        return apis[url]\n",
    "    else:\n",
    "        print('请求的API ' + url + ' 正在获取')\n",
    "        # 单线程爬吧，每个mashup相关的api数量也不是特别多\n",
    "        req_api = requests.get('https://www.programmableweb.com' + url)\n",
    "        bs_api = BeautifulSoup(req_api.text, features='html.parser')\n",
    "        # 找node-header，锁定标题\n",
    "        node_header = bs_api.find('div', class_='node-header')\n",
    "        if node_header is None:\n",
    "            # 失败\n",
    "            failed_api_hrefs.append(url)\n",
    "            print('failed')\n",
    "            return None\n",
    "        title = node_header.find('h1').text.strip()     # title\n",
    "\n",
    "        tags = []   #tags\n",
    "        div_tags = bs_api.find('div', class_='tags')\n",
    "        tags_a = div_tags.find_all('a')\n",
    "        for a in tags_a:\n",
    "            tags.append(a.text.strip())\n",
    "\n",
    "        intro = bs_api.find('div', class_='intro')\n",
    "        description = ''\n",
    "        if intro is None:\n",
    "            # 失败\n",
    "            failed_api_hrefs.append(url)\n",
    "            print('not found intro')\n",
    "        else:\n",
    "            description_div = intro.find('div', class_='api_description')\n",
    "            if description_div is None:\n",
    "                failed_api_hrefs.append(url)\n",
    "                print('not found description_div')\n",
    "            else:\n",
    "                description = description_div.text.strip()  #description\n",
    "\n",
    "        # 定位版本versions\n",
    "        versions = []\n",
    "        version_field = bs_api.find('div', id='version-details-field')\n",
    "        version_results = version_field.find_all('div', class_='version-result-set')\n",
    "\n",
    "        # 开始获取版本\n",
    "        for result in version_results:\n",
    "            divs = result.find_all('div')\n",
    "            version_title = ''\n",
    "            version_style = ''\n",
    "            version_status = ''\n",
    "            version_version = ''\n",
    "            version_submit_date = ''\n",
    "            for div in divs:\n",
    "                # 直接找所有div，根据label去匹配\n",
    "                label = div.find('label')\n",
    "                if label is None:\n",
    "                    # 不是我要的div\n",
    "                    continue\n",
    "                else:\n",
    "                    lt = label.text.strip()\n",
    "                    version_info = div.text.strip()\n",
    "                    if lt == 'Title:':\n",
    "                        version_title = version_info.split(':')[1]\n",
    "                    elif lt == 'Style:':\n",
    "                        version_style = version_info.split(':')[1]\n",
    "                    elif lt == 'Version:':\n",
    "                        version_version = version_info.split(':')[1]\n",
    "                    elif lt == 'Status:':\n",
    "                        # 这个实在没发现什么好的办法，只能碰碰运气了\n",
    "                        spans = div.find_all('span')\n",
    "                        # 因为0是那个mobile，我不需要，只需要1\n",
    "                        span = spans[1]\n",
    "                        version_status = span.text.strip()\n",
    "                    elif lt == 'Submitted:':\n",
    "                        version_submit_date = version_info.split(':')[1]\n",
    "            version = ApiVersion(version_title, version_style, version_version, version_status, version_submit_date)\n",
    "            versions.append(version)\n",
    "\n",
    "        # 创建API对象\n",
    "        api = Api(title, tags, description, url, versions)\n",
    "        apis[url] = api\n",
    "        print(api.title)\n",
    "        print('请求的API ' + url + ' 获取完成！')\n",
    "        return api\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "6NSCM37o0BCG"
   },
   "source": [
    "# 下面的函数最后修改于2020.09.04 22:49\n",
    "def get_mashup(url, is_dead_mashup=False):\n",
    "    # -------------\n",
    "        # 逐步获取关键数据\n",
    "        req1 = requests.get(href)   # 网络请求\n",
    "        bs2 = BeautifulSoup(req1.text, features='html.parser')  # soup 解析html\n",
    "        \n",
    "        # 获取标题\n",
    "        node_header = bs2.find('div', class_='node-header')\n",
    "        if node_header is None:\n",
    "            print('href \"' + href + '\" does not contain class=node-header.')\n",
    "            failed_hrefs.append(href)\n",
    "            return None\n",
    "        title = node_header.find('h1').text   #标题\n",
    "\n",
    "        # 获取tags\n",
    "        tags = []\n",
    "        # tags_div = node_header.find('div', class_='tags')\n",
    "        # tags_a = tags_div.find_all('a')\n",
    "        # for a_tag in tags_a:\n",
    "        #     # a标签\n",
    "        #     tags.append(a_tag.text.strip())\n",
    "\n",
    "        # 获取description\n",
    "        intro = bs2.find('div', class_='intro')\n",
    "        description_div = intro.find('div', class_='field-item')\n",
    "        description = ''\n",
    "        if description_div is None:\n",
    "            # 没有找到description的div，也是爬取失败了\n",
    "            print(url + ' not found class=field-item (description_div)')\n",
    "        else:\n",
    "            description = description_div.text  # description\n",
    "        \n",
    "        # 获取Spec\n",
    "        tabs_content = bs2.find('div', id='tabs-content')\n",
    "        fields = tabs_content.find_all('div', class_='field')\n",
    "        # 获取完成\n",
    "        # --------------\n",
    "        related_apis = []\n",
    "        categories = []\n",
    "        url = 'Undefined'\n",
    "        mashup_type = 'Undefined'\n",
    "        for field in fields:\n",
    "            label = field.find('label').text.strip()  \n",
    "            # 通过label判断而不是按照顺序，防止某些页面的顺序不一致而出现问题\n",
    "            if label == 'Related APIs':\n",
    "                # related apis\n",
    "                related_apis = [get_api(a0.get('href')) for a0 in field.find_all('a')]\n",
    "            elif label == 'Categories':\n",
    "                # 分类\n",
    "                categories = [a0.text for a0 in field.find_all('a')]\n",
    "            elif label == 'URL':\n",
    "                # 获取URL\n",
    "                url_a = field.find('a')\n",
    "                if url_a is None:\n",
    "                    # URL 为空\n",
    "                    print(title + \"'s url is Undefined\")\n",
    "                else:\n",
    "                    url = url_a.get('href')\n",
    "            elif label == 'Mashup/App Type':\n",
    "                # 获取type\n",
    "                mashup_type_span = field.find('span')\n",
    "                if mashup_type_span is None:\n",
    "                    # type 为空\n",
    "                    print(title + \"'s type is Undefined\")\n",
    "                else:\n",
    "                    mashup_type = mashup_type_span.text.strip()\n",
    "\n",
    "        # 判断一下是不是deadpooled的mashup\n",
    "        if is_dead_mashup:\n",
    "            # 获取changelog\n",
    "            changelogs = get_changelogs(url)\n",
    "            deprecated_date = 'Not sure'\n",
    "            if len(changelogs) >= 1:\n",
    "                deprecated_date = changelogs[0].split('|')[0]\n",
    "            mashup = DeadpoolMashup(title, tags, description, related_apis, categories, url, mashup_type, changelogs, deprecated_date, hrefs_date[href])\n",
    "        else:\n",
    "            # 创建mashup对象\n",
    "            mashup = Mashup(title, tags, description, related_apis, categories, url, mashup_type, hrefs_date[href])\n",
    "        return mashup\n",
    "\n",
    "\n",
    "def get_detail_info(thread_count, hrefs):\n",
    "    \"\"\"\n",
    "    遍历传入的hrefs列表，访问它的每一个href，\n",
    "    然后获取这个页面中的 标题 title、描述 description、相关API related_apis\n",
    "    并且创建Mashup对象，加入mashups的列表中。\n",
    "    :param thread_count: 当前已启用的线程数量\n",
    "    :param hrefs: 链接列表，每个元素都是一个链接，在这个函数中会被遍历，发起网络请求，爬取返回的html中的内容\n",
    "    :return: none\n",
    "    \"\"\"\n",
    "    thread_count[0] += 1    # 已启用线程数加1\n",
    "    for href in hrefs:\n",
    "        mashup = get_mashup(href)\n",
    "        mashups.append(mashup)\n",
    "    thread_count[0] -= 1    # 已启用线程数减1"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "bvhKq6ZKMXFd"
   },
   "source": [
    "def get_changelogs(url):\n",
    "    changelogs = []\n",
    "    change_req = requests.get(url + '/changelog')\n",
    "    cbs = BeautifulSoup(change_req.text, features='html.parser')    # changelog beautiful soup\n",
    "    changelog_divs = cbs.find_all('p', class_='changelog_divider')\n",
    "    for changelog_div in changelog_divs:\n",
    "        changelog_text = changelog_div.get_text(\"|\", strip=True)\n",
    "        if 'deadpool' in changelog_text.lower():\n",
    "            print(True)\n",
    "            # 有用的changelog\n",
    "            changelogs.append(changelog_text)\n",
    "    return changelogs"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "K8lrMvHK9dAg"
   },
   "source": [
    "failed_hrefs = []"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "jM6gRCX734ox"
   },
   "source": [
    "mashups = []"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "AuvbTs9iq_Wm",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 51
    },
    "outputId": "d5c195b1-435d-4040-8f2a-357ec9457268"
   },
   "source": [
    "# 开启多线程访问hrefs\n",
    "i = 0\n",
    "length = len(hrefs_split[:1])\n",
    "while True:\n",
    "    if i >= length:\n",
    "        break\n",
    "    if thread_count[0] < max_thread:\n",
    "        if i % 100 == 0:\n",
    "            print('i =', i)\n",
    "        # 创建新新线程\n",
    "        t = threading.Thread(target=get_detail_info, args=(thread_count, hrefs_split[i]))\n",
    "        t.start()\n",
    "        print('已创建新线程。当前处理第', (i + 1), '个子列表，总共有', length, '个子列表')\n",
    "        i += 1"
   ],
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "text": [
      "i = 0\n",
      "已创建新线程。当前处理第 1 个子列表，总共有 1 个子列表\n"
     ],
     "name": "stdout"
    }
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "oD5kMrjvq_br",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 136
    },
    "outputId": "89bc599f-7ec8-4d4c-9442-ce872da5c6c8"
   },
   "source": [
    "# %% 写文件\n",
    "# 将mashups写入文件，作为数据集\n",
    "file = open('data0906-1-temp.txt', mode='w', encoding='utf-8')\n",
    "file.write('Dataset Size(Mashup Count): ' + str(len(mashups)) + '\\n')\n",
    "\n",
    "for mashup in mashups:\n",
    "\n",
    "    bbb = mashup.json()\n",
    "\n",
    "    # print(bbb)\n",
    "    file.write(bbb)\n",
    "file.flush()\n",
    "file.close()"
   ],
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "text": [
      "请求完成: https://www.programmableweb.com/mashup/smarkets\n",
      "related api\n",
      "请求的API /api/amazon-product-advertising 直接返回\n",
      "categories\n",
      "url\n",
      "https://www.programmableweb.com/mashup/smarkets 完成\n",
      "正在获取: https://www.programmableweb.com/mashup/pumps.ie\n"
     ],
     "name": "stdout"
    }
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "GJn7RQGjeNuL"
   },
   "source": [
    "# 写文件\n",
    "with open('data0906-1.txt', mode='w', encoding='utf-8') as file:\n",
    "    file.write(json.dumps(obj=mashups, default=lambda x : x.__dict__, sort_keys=False, indent=2))"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Mashups accessibility"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def dict2mashup(mashup_dict, is_deadpool_mashup=False):\n",
    "    # 转为dict\n",
    "    title = mashup_dict['title']\n",
    "    descrip = mashup_dict['description']\n",
    "    categories = mashup_dict['categories']"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "mashup_file = 'data0906-1.txt'\n",
    "deadpool_mashup_file = 'deadpool_mashups-0910-1.txt'\n",
    "mashups1 = {}\n",
    "deadpool_mashups = {}\n",
    "with open(deadpool_mashup_file, 'r', encoding='utf-8') as file:\n",
    "    text = file.read()\n",
    "    print(len(text))\n",
    "    mashup_dict_list = json.loads(text)\n",
    "    print(len(mashup_dict_list))\n",
    "    for m_dict in mashup_dict_list:\n",
    "        if m_dict is not None:\n",
    "            mashups1[m_dict['url']] = m_dict\n",
    "print(len(mashups1))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# 定义一个类存储东西\n",
    "class MashupVisit:\n",
    "    def __init__(self, title, url, status_code, is_accessible):\n",
    "        self.title = title\n",
    "        self.url = url\n",
    "        self.status_code = status_code\n",
    "        self.is_accessible = is_accessible\n",
    "\n",
    "    def json(self):\n",
    "        return json.dumps(obj=self, default=lambda x : x.__dict__, sort_keys=False, indent=2)\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# 尝试访问所有URL，记录访问状态\n",
    "visits = []\n",
    "for url in list(mashups1.keys())[-10:-1]:\n",
    "    print(url)\n",
    "    try:\n",
    "        req = requests.get(url)\n",
    "        status_code = req.status_code\n",
    "    except:\n",
    "        status_code = -1\n",
    "    is_accessible = False\n",
    "    mashup_title = mashups1[url]['title']\n",
    "    if status_code == 200 or status_code == 429:\n",
    "        bs = BeautifulSoup(req.text, features='html.parser')\n",
    "        # 先判断这个网页中是否包含\"mashup\"字样，如果包含，就进一步判断标题\n",
    "        html_text = bs.text\n",
    "        if 'mashup' in html_text.lower():\n",
    "            title = bs.find('head').find('title')\n",
    "            if title is not None:\n",
    "                if 'for sale' not in title.text.lower():\n",
    "                    # For Sale 很可能说明这个域名已经跪了，都正在出售了\n",
    "                    # False\n",
    "                    is_accessible = True\n",
    "            else:\n",
    "                # 标题为空，暂时标记为True吧\n",
    "                is_accessible = True\n",
    "        else:\n",
    "            # mashup 没有被包含在整个网页的内容中\n",
    "            is_accessible = False\n",
    "\n",
    "    visit = MashupVisit(mashup_title, url, status_code, is_accessible)\n",
    "    visits.append(visit)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def visit_mashup(url):\n",
    "    try:\n",
    "        req = requests.get(url)\n",
    "        status_code = req.status_code\n",
    "    except:\n",
    "        status_code = -1\n",
    "    is_accessible = False\n",
    "    mashup_title = mashups1[url]['title']\n",
    "    if status_code == 200 or status_code == 429:\n",
    "        bs = BeautifulSoup(req.text, features='html.parser')\n",
    "        # 不可访问或报错直接False\n",
    "        # 如果可访问，先判断网页HTML代码中中是否包含这个Mashup的名字\n",
    "        # 如果包含名字，进一步判断是否for sale，for sale 就False\n",
    "        # 如果不包含名字，就把名字拆成单词看在不在网页中出现，有一个单词出现就标为True\n",
    "        # 一个单词都不包含的话，就判断内容是否包含 mashup 字眼，有就True\n",
    "        # 如果还不包含mashup字眼，就标记为False\n",
    "        html_text = req.text\n",
    "        html_text_lower = html_text.lower()\n",
    "        if mashup_title.replace('Mashup:', '').strip().lower() in html_text_lower:\n",
    "            # 此时可以直接标为True\n",
    "            is_accessible = True\n",
    "            # 包含名字\n",
    "\n",
    "            # 判断 for sale 与否\n",
    "            head = bs.find('head')\n",
    "            if head is not None:    # 如果没有标题，就不判断啊是否for sale 了\n",
    "                title = head.find('title')\n",
    "                if title is not None:\n",
    "                    # 含有 title 字段\n",
    "                    if 'for sale' in title.text.lower():\n",
    "                        # For Sale 很可能说明这个域名已经跪了，都正在出售了\n",
    "                        # False\n",
    "                        is_accessible = False\n",
    "        else:\n",
    "            # 把标题按照空格拆分，判断是否至少包含一个单词\n",
    "            page_lower = bs.text.lower()\n",
    "            title_splits = mashup_title.lower().split(' ')\n",
    "            exists = False\n",
    "            for element in title_splits:\n",
    "                if element in page_lower:\n",
    "                    exists = True\n",
    "                    break\n",
    "            if exists:\n",
    "                is_accessible = True\n",
    "            elif 'mashup' in html_text_lower:   # 不包含名字，检查是否包含mashup 字眼\n",
    "                # 包含\n",
    "                is_accessible = True\n",
    "            else:\n",
    "                # 不包含\n",
    "                is_accessible = False\n",
    "\n",
    "    visit = MashupVisit(mashup_title, url, status_code, is_accessible)\n",
    "    return visit\n",
    "\n",
    "def threads_visit_mashups(thread_count, mashup_links):\n",
    "    thread_count[0] += 1\n",
    "    for url in mashup_links:\n",
    "        visits.append(visit_mashup(url))\n",
    "    thread_count[0] -= 1\n",
    "\n",
    "urls = list(mashups1.keys())"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def list_split(list_info, per_list_len):\n",
    "    list_of_group = zip(*(iter(list_info),) * per_list_len)\n",
    "    end_list = [list(i) for i in list_of_group]  # i is a tuple\n",
    "    count = len(list_info) % per_list_len\n",
    "    end_list.append(list_info[-count:]) if count != 0 else end_list\n",
    "    return end_list"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "max_thread = 100\n",
    "thread_count = [0]\n",
    "now_list = list_split(urls, 8)\n",
    "visits = []\n",
    "i = 0\n",
    "length = len(now_list)\n",
    "while True:\n",
    "    if i >= length:\n",
    "        break\n",
    "    if thread_count[0] < max_thread:\n",
    "        # 创建新新线程\n",
    "        t = threading.Thread(target=threads_visit_mashups, args=(thread_count, now_list[i]))\n",
    "        t.start()\n",
    "        print('已创建新线程。当前处理第', (i + 1), '个子列表，总共有', length, '个子列表')\n",
    "        i += 1\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "with open('mashup_accessibility.txt', 'w', encoding='utf-8') as file:\n",
    "    file.write(json.dumps(obj=visits, default=lambda x : x.__dict__, sort_keys=False, indent=2))\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ]
}